/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure.test.io;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.FileParsingParameters;
import org.biojava.nbio.structure.io.LocalPDBDirectory.ObsoleteBehavior;
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
import org.biojava.nbio.structure.xtal.CrystalCell;
import org.junit.After;
import org.junit.BeforeClass;
import org.junit.ComparisonFailure;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.vecmath.Matrix4d;
import java.io.*;
import java.util.*;
import static org.junit.Assert.*;
/**
* A test to make sure both PDB and mmCIF parsers can parse
* properly large samples of the PDB.
*
* These tests take very long to run and are therefore excluded by default in the pom.
* To run the 1000-entry test, use:
* <pre>
* mvn -Dtest=TestLongPdbVsMmCifParsing#testLongPdbVsMmCif test
* </pre>
* or for the 10000 entries:
* <pre>
* mvn -Dtest=TestLongPdbVsMmCifParsing#testVeryLongPdbVsMmCif test
* </pre>
*
*
* @author duarte_j
*
*/
public class TestLongPdbVsMmCifParsing {
/** Logger used to warn about mol_id (entity_id) mismatches found while comparing chains. */
private static final Logger logger = LoggerFactory.getLogger(TestLongPdbVsMmCifParsing.class);
/** Classpath resource with the PDB ids read by {@code testLongPdbVsMmCif}. */
private static final String TEST_LARGE_SET_FILE = "/random_1000_set.list";
/** Classpath resource with the PDB ids read by {@code testVeryLongPdbVsMmCif}. */
private static final String TEST_VERY_LARGE_SET_FILE = "/random_10000_set.list";
/** Number of progress dots printed per output line in {@code testAll}. */
private static final int DOTS_PER_LINE = 100;
/** Tolerance used when comparing crystal cell parameters (a, b, c, alpha, beta, gamma). */
private static final float DELTA = 0.01f;
/** Tolerance used when comparing resolution values. */
private static final float DELTA_RESOLUTION = 0.01f;
/** Tolerance used when comparing Rfree values. */
private static final float DELTA_RFREE = 0.01f;
/**
 * The maximum number of PDB entries for which a mismatch of mol_ids (entity_ids)
 * between the PDB and mmCIF files is tolerated.
 * If there are more mismatches than this, the test will fail.
 * As of 2014.12.04 there are 7 mismatches.
 */
private static final int MAX_ALLOWED_MOL_ID_MISMATCHES = 10;
/** Shared structure cache; created in {@code setUpBeforeClass} and switched between PDB and mmCIF mode per fetch. */
private static AtomCache cache;
/** Parsing parameters installed on {@link #cache}; adjusted before every fetch. */
private static FileParsingParameters params;
/** Id of the entry currently being tested; set back to null by {@code testAll} once all entries passed. */
private String pdbId;
/** Number of entries started so far in the current {@code testAll} run (1-based). */
private int countTested = 0;
/** Ids of the entries with at least one chain whose mol_id differs between PDB and mmCIF; re-created by {@code testAll}. */
private HashSet<String> pdbIdsWithMismatchingMolIds;
/**
 * Creates the shared {@link AtomCache}, prints a banner describing the run and
 * refuses to start when the cache directory is the JVM temp directory.
 *
 * @throws IllegalArgumentException if the cache path equals {@code java.io.tmpdir}
 *         (with or without a trailing file separator)
 */
@BeforeClass
public static void setUpBeforeClass() {
	cache = new AtomCache();

	System.out.println("##### Starting long test. THIS CAN TAKE UP TO 1 HOUR TO COMPLETE!");
	System.out.println("##### Using PDB/mmCIF cache dir: "+cache.getPath());
	System.out.println("##### Each dot is a PDB entry being tested. "+DOTS_PER_LINE+" dots per line");

	// disallow the use of the default /tmp dir, to make sure PDB_DIR is set
	boolean isPlainTmpDir = cache.getPath().equals(System.getProperty("java.io.tmpdir"));
	boolean isTmpDirWithSeparator =
			!isPlainTmpDir && cache.getPath().equals(System.getProperty("java.io.tmpdir")+File.separator);
	if (isPlainTmpDir || isTmpDirWithSeparator) {
		throw new IllegalArgumentException("PDB_DIR has not been set or it is set to the default temp directory. Please set PDB_DIR to run this test");
	}

	// the same params instance is later tweaked by getPdbStructure/getCifStructure
	params = new FileParsingParameters();
	cache.setFileParsingParams(params);
	cache.setObsoleteBehavior(ObsoleteBehavior.THROW_EXCEPTION);
}
/**
 * Runs the PDB-vs-mmCIF comparison over every entry listed in the large test set file.
 */
@Test
public void testLongPdbVsMmCif() throws IOException, StructureException {
	testAll(readTestSetFile(TEST_LARGE_SET_FILE));
}
/**
 * Runs the PDB-vs-mmCIF comparison over every entry listed in the very large test set file.
 */
@Test
public void testVeryLongPdbVsMmCif() throws IOException, StructureException {
	testAll(readTestSetFile(TEST_VERY_LARGE_SET_FILE));
}
/**
 * Runs the PDB-vs-mmCIF comparison for one hard-coded entry; handy for quick checks.
 */
@Test
public void testSingle() throws IOException, StructureException {
	List<String> singleEntry = Arrays.asList("4kro");
	testAll(singleEntry);
}
/**
 * Prints the last entry that was being tested, if any.
 * {@code testAll} clears {@code pdbId} after a fully successful run, so nothing is
 * printed in that case.
 */
@After
public void printInfo() {
	if (pdbId == null) {
		return;
	}
	System.out.println("\n##### ----> Last tested PDB entry was: "+pdbId + " ("+ countTested + " done so far)");
}
/**
 * Tests every given entry in order, printing one progress dot per entry, then checks
 * the accumulated mol_id mismatch count and prints the elapsed time.
 *
 * @param pdbIds the PDB ids to test
 * @throws IOException if thrown while testing an entry
 * @throws StructureException if thrown while testing an entry
 */
private void testAll(List<String> pdbIds) throws IOException, StructureException {
	pdbIdsWithMismatchingMolIds = new HashSet<String>();

	long startMillis = System.currentTimeMillis();

	System.out.println("##### Total of "+pdbIds.size()+" PDB entries to test");

	int processed = 0;
	for (String currentId : pdbIds) {
		processed++;
		// recorded in fields so that printInfo can report where a failure happened
		pdbId = currentId;
		countTested = processed;

		System.out.print(".");
		testSingleEntry(currentId);

		boolean lineIsFull = (processed % DOTS_PER_LINE) == 0;
		if (lineIsFull) {
			System.out.println();
		}
	}

	pdbId = null; // to avoid printing the message if tests pass for all PDB entries

	long endMillis = System.currentTimeMillis();

	checkWarnings();

	double elapsedMinutes = (endMillis - startMillis) / 60000.0;
	System.out.printf("\nDone in %5.1f minutes\n", elapsedMinutes);
}
/**
 * Reports how many entries had mismatching mol_ids (entity_ids) between PDB and mmCIF
 * and fails only when that count is above {@code MAX_ALLOWED_MOL_ID_MISMATCHES}.
 * A count exactly equal to the maximum is still accepted.
 */
private void checkWarnings() {
	int mismatchCount = pdbIdsWithMismatchingMolIds.size();

	if (mismatchCount > 0) {
		System.out.println("A total of "+mismatchCount+" PDB entries have mismatches in their Compound mol_ids (entity_ids)");
	}

	// "<=": the limit is the maximum allowed, so only counts strictly above it fail
	assertTrue("Mismatching mol_id (entity_id) between pdb and cif above the maximum allowed ("+MAX_ALLOWED_MOL_ID_MISMATCHES+")",
			mismatchCount <= MAX_ALLOWED_MOL_ID_MISMATCHES);
}
/**
 * Loads one entry from mmCIF and then from PDB format and runs the structure, header
 * and chain comparisons on the pair. On a {@link ComparisonFailure} the actual and
 * expected values are printed before the failure is rethrown.
 *
 * @param pdbId the PDB id of the entry to test
 * @throws IOException if thrown while fetching either structure
 * @throws StructureException if thrown while fetching or comparing the structures
 */
private void testSingleEntry(String pdbId) throws IOException, StructureException {
	// mmCIF first, then PDB: each getter switches the shared cache mode
	Structure cifStructure = getCifStructure(pdbId);
	Structure pdbStructure = getPdbStructure(pdbId);

	assertNotNull(cifStructure);
	assertNotNull(pdbStructure);

	try {
		testStructureMethods(pdbStructure, cifStructure);
		testHeader(pdbStructure, cifStructure);
		testChains(pdbStructure, cifStructure);
	} catch (ComparisonFailure failure) {
		System.out.println("\nComparison failure! Values follow:");
		System.out.println("Actual : "+failure.getActual());
		System.out.println("Expected: "+failure.getExpected());
		throw failure;
	}
}
/**
 * Compares structure-level properties of the same entry parsed from PDB and from mmCIF:
 * NMR/crystallographic flags, model count, PDB code, biological-assembly flag,
 * presence of entity types, polymer entity counts and disulfide bond counts.
 *
 * @param sPdb the structure parsed from the PDB file
 * @param sCif the structure parsed from the mmCIF file
 */
private void testStructureMethods(Structure sPdb, Structure sCif) {

	assertEquals("failed isNmr:",sPdb.isNmr(), sCif.isNmr());
	assertEquals("failed isCrystallographic:",sPdb.isCrystallographic(), sCif.isCrystallographic());
	assertEquals("failed nrModels:",sPdb.nrModels(), sCif.nrModels());
	assertEquals("failed for getPdbCode:",sPdb.getPDBCode(),sCif.getPDBCode());

	assertFalse(sPdb.isBiologicalAssembly());
	assertFalse(sCif.isBiologicalAssembly());

	// TODO journal article not parsed in mmCIF parser
	//assertEquals("failed hasJournalArticle",sPdb.hasJournalArticle(),sCif.hasJournalArticle());

	// entity type should always be present
	for (EntityInfo e : sPdb.getEntityInfos()) {
		assertNotNull(e.getType());
	}
	for (EntityInfo e : sCif.getEntityInfos()) {
		assertNotNull(e.getType());
	}

	// entities: there's quite some inconsistencies here between pdb and cif:
	// sugar polymers are not in pdb at all: we avoid them
	if (!containsSugar(sCif)) {
		int entCountCif = countPolymerEntities(sCif);
		int entCountPdb = countPolymerEntities(sPdb);
		assertEquals("failed number of non-sugar polymeric Entities pdb vs cif", entCountPdb, entCountCif);
	}

	// ss bonds
	// 4ab9 contains an error in ssbond in pdb file (misses 1 ssbond)
	// 2bdi contains also errors, the counts in both differ a lot 80 vs 92
	// Compared case-insensitively (as testHeader does) so the exclusions apply whatever
	// the case of the parsed code.
	String pdbCode = sPdb.getPDBCode();
	if (!pdbCode.equalsIgnoreCase("4AB9") && !pdbCode.equalsIgnoreCase("2BDI")) {
		assertEquals("number of ss bonds should coincide pdb vs cif", sPdb.getSSBonds().size(), sCif.getSSBonds().size());
	}
}

/** Counts the entities of the given structure whose type is {@code EntityType.POLYMER}. */
private int countPolymerEntities(Structure s) {
	int count = 0;
	for (EntityInfo e : s.getEntityInfos()) {
		if (e.getType() == EntityType.POLYMER) {
			count++;
		}
	}
	return count;
}
/**
 * Compares the headers of the same entry parsed from PDB and from mmCIF: id code,
 * authors, classification, description, dates, experimental techniques, resolution,
 * title, Rfree, crystallographic info (space group, cell, NCS operators) and
 * biological assemblies.
 * <p>
 * Several comparisons are deliberately skipped for data known to be inconsistent
 * between the two formats; the inline comments name the offending entries.
 *
 * @param sPdb the structure parsed from the PDB file
 * @param sCif the structure parsed from the mmCIF file
 */
private void testHeader(Structure sPdb, Structure sCif) {

	PDBHeader hPdb = sPdb.getPDBHeader();
	PDBHeader hCif = sCif.getPDBHeader();

	boolean isNmr = sPdb.isNmr();
	boolean isCrystallographic = sPdb.isCrystallographic();

	assertNotNull(hPdb);
	assertNotNull(hCif);

	assertEquals("failed for PDB id (getIdCode)",hPdb.getIdCode(),hCif.getIdCode());

	assertNotNull("pdb authors null",hPdb.getAuthors());
	assertNotNull("cif authors null",hCif.getAuthors());
	// I suppose 2 is a safe bet for authors length...
	assertTrue("authors length should be at least 2",hPdb.getAuthors().length()>=2);
	// for authors we strip spaces in case of ambiguities with names
	// there's too much variability in authors, commenting out, e.g. for 1zjo they don't coincide
	//assertEquals("failed getAuthors:",
	//		hPdb.getAuthors().toLowerCase().replaceAll(" ", ""),
	//		hCif.getAuthors().toLowerCase().replaceAll(" ", ""));

	assertNotNull("pdb classification null in pdb",hPdb.getClassification());
	assertNotNull("cif classification null in cif",hCif.getClassification());
	// there's too much variability in classification between pdb and mmcif, e.g. in 3ofb they don't coincide
	//assertEquals("failed getClassification:",hPdb.getClassification().toLowerCase(), hCif.getClassification().toLowerCase());

	// description is set in CIF parser to same as classification (_struct_keywords.pdbx_keywords field)
	// while in PDB parser it is simply not set
	//assertNotNull("pdb description null",hPdb.getDescription());
	assertNotNull("cif description null",hCif.getDescription());
	//assertEquals("failed getDescription:",hPdb.getDescription().toLowerCase(), hCif.getDescription().toLowerCase());

	assertEquals("failed getDepDate:",hPdb.getDepDate(), hCif.getDepDate());
	assertEquals("failed getModDate:",hPdb.getModDate(), hCif.getModDate());

	assertNotNull(hPdb.getExperimentalTechniques());
	assertNotNull(hCif.getExperimentalTechniques());
	assertTrue(hPdb.getExperimentalTechniques().size()>0);
	assertEquals("failed for getExperimentalTechniques",hPdb.getExperimentalTechniques(),hCif.getExperimentalTechniques());

	// for some Electron Microscopy/Crystallography entries (e.g. 3iz2) the resolution in mmCIF is not present in the usual place
	if (!hPdb.getExperimentalTechniques().contains(ExperimentalTechnique.ELECTRON_CRYSTALLOGRAPHY) &&
		!hPdb.getExperimentalTechniques().contains(ExperimentalTechnique.ELECTRON_MICROSCOPY)) {
		assertEquals("failed getResolution:",hPdb.getResolution(), hCif.getResolution(), DELTA_RESOLUTION);
	}

	// JRNL record is sometimes missing (e.g. 21bi) and thus is null, we can't test for nulls here in the general case
	//assertNotNull("journal article null",hPdb.getJournalArticle());
	// TODO journal article not parsed in mmCIF parser
	// TODO when fixed in mmCIF parser, compare PDB to mmCIF values if not null
	//assertNotNull("journal article null",hCif.getJournalArticle());

	assertNotNull("title null in pdb",hPdb.getTitle());
	assertNotNull("title null in cif",hCif.getTitle());
	// for titles we strip spaces in case of ambiguities with spacing
	assertEquals("failed for getTitle",
			hPdb.getTitle().toLowerCase().replaceAll(" ", ""),
			hCif.getTitle().toLowerCase().replaceAll(" ", ""));

	// tests specific to experimental techniques
	if (isNmr) {
		assertEquals("resolution is not the default value in NMR structure",
				PDBHeader.DEFAULT_RESOLUTION, hPdb.getResolution(), DELTA_RESOLUTION);
	}

	if (!isCrystallographic) {
		// argument order is (message, expected, actual, delta): the parsed Rfree is the
		// value under test and DELTA_RFREE is only the tolerance
		assertEquals("rfree is not the default value in non-crystallographic structure in pdb",
				PDBHeader.DEFAULT_RFREE, hPdb.getRfree(), DELTA_RFREE);
		assertEquals("rfree is not the default value in non-crystallographic structure in cif",
				PDBHeader.DEFAULT_RFREE, hCif.getRfree(), DELTA_RFREE);
	} else {
		assertEquals("failed for Rfree:",hPdb.getRfree(), hCif.getRfree(), DELTA_RFREE);

		assertNotNull("getCrystallographicInfo is null in pdb",hPdb.getCrystallographicInfo());
		assertNotNull("getCrystallographicInfo is null in cif",hCif.getCrystallographicInfo());

		PDBCrystallographicInfo ciPdb = hPdb.getCrystallographicInfo();
		PDBCrystallographicInfo ciCif = hCif.getCrystallographicInfo();

		assertNotNull("space group null in pdb", ciPdb.getSpaceGroup());
		assertNotNull("space group null in cif", ciCif.getSpaceGroup());
		assertNotNull("crystal cell null in pdb",ciPdb.getCrystalCell());
		assertNotNull("crystal cell null in cif",ciCif.getCrystalCell());

		assertEquals("failed for space group short symbol pdb vs cif",
				ciPdb.getSpaceGroup().getShortSymbol(), ciCif.getSpaceGroup().getShortSymbol());

		CrystalCell ccPdb = ciPdb.getCrystalCell();
		CrystalCell ccCif = ciCif.getCrystalCell();

		assertEquals("failed for cell A:",ccPdb.getA(),ccCif.getA(),DELTA);
		assertEquals("failed for cell B:",ccPdb.getB(),ccCif.getB(),DELTA);
		assertEquals("failed for cell C:",ccPdb.getC(),ccCif.getC(),DELTA);
		assertEquals("failed for cell Alpha:",ccPdb.getAlpha(),ccCif.getAlpha(),DELTA);
		assertEquals("failed for cell Beta:",ccPdb.getBeta(),ccCif.getBeta(),DELTA);
		assertEquals("failed for cell Gamma:",ccPdb.getGamma(),ccCif.getGamma(),DELTA);

		if (ciPdb.getNcsOperators()==null) {
			assertTrue(ciCif.getNcsOperators()==null);
		} else {
			Matrix4d[] ncsOpersPdb = ciPdb.getNcsOperators();
			Matrix4d[] ncsOpersCif = ciCif.getNcsOperators();

			assertEquals("Number of NCS operators don't coincide", ncsOpersPdb.length, ncsOpersCif.length);

			for (int i=0;i<ncsOpersPdb.length;i++) {
				assertTrue("NCS operators "+i+" don't coincide",ncsOpersPdb[i].epsilonEquals(ncsOpersCif[i], 0.0001));
			}
		}
	}

	// biological assemblies
	// a) we don't test in non-crystallographic case because annotation is inconsistent between PDB and mmCIF,
	//    e.g. 2kli (NMR) has bioassembly annotation in mmCIF but not in PDB
	// b) we don't test virus entries (we check via looking at ncs operators==null):
	//    they are inconsistent PDB vs mmCIF (e.g. 1pgw has no oligomeric size in PDB, and 120 in mmCIF)
	if (isCrystallographic && hPdb.getCrystallographicInfo().getNcsOperators()==null
			// 1ruh, 2ms2, 2r06: virus proteins with data consistency issue: it's missing the MTRXn record (so it appears as ncs operators==null)
			&& (!sPdb.getPDBCode().equalsIgnoreCase("1ruh"))
			&& (!sPdb.getPDBCode().equalsIgnoreCase("2ms2"))
			&& (!sPdb.getPDBCode().equalsIgnoreCase("2r06"))) {

		assertEquals("Number of bioassemblies doesn't coincide",
				hPdb.getNrBioAssemblies(), hCif.getNrBioAssemblies());

		Map<Integer,BioAssemblyInfo> batPdb = hPdb.getBioAssemblies();
		Map<Integer,BioAssemblyInfo> batCif = hCif.getBioAssemblies();

		assertEquals("Size of bioassemblies map doesn't coincide with nr of bioassemblies",
				hPdb.getNrBioAssemblies(),batPdb.size());

		assertEquals("Size of bioassemblies maps don't coincide",batPdb.size(), batCif.size());

		for (int id:batPdb.keySet()) {
			assertTrue("Bioassembly id is not contained in mmCIF",batCif.containsKey(id));

			// there's an inconsistency in 4amh pdb vs mmCIF in mmSize
			if (sPdb.getPDBCode().equalsIgnoreCase("4amh")) continue;

			assertEquals("Macromolecular size of assembly "+id+" doesn't coincide",
					batPdb.get(id).getMacromolecularSize(), batCif.get(id).getMacromolecularSize());
		}
	}
}
/**
 * Compares chain counts (polymer, non-polymer, water and total) between the PDB and
 * mmCIF structures, then compares each polymer chain of the PDB structure with the
 * mmCIF polymer chain of the same name.
 *
 * @param sPdb the structure parsed from the PDB file
 * @param sCif the structure parsed from the mmCIF file
 * @throws StructureException declared for the chain lookups and comparisons
 */
private void testChains(Structure sPdb, Structure sCif) throws StructureException {
	assertNotNull(sPdb.getChains());
	assertNotNull(sCif.getChains());

	// sugar chains are badly annotated and inconsistent between pdb/mmcif
	// let's skip this test if we have sugar entities
	boolean hasSugar = containsSugar(sCif);
	if (!hasSugar) {
		assertEquals(sPdb.getPolyChains().size(), sCif.getPolyChains().size());

		// some entries like 3c5e are inconsistent in residue numbering for UNL (unknown) residues between pdb and mmcif
		// skipping the non-polymer and total counts for them
		boolean hasUnl = containsUNL(sCif);
		if (!hasUnl) {
			assertEquals(sPdb.getNonPolyChains().size(), sCif.getNonPolyChains().size());
		}

		assertEquals(sPdb.getWaterChains().size(), sCif.getWaterChains().size());

		if (!hasUnl) {
			assertEquals(sPdb.getChains().size(),sCif.getChains().size());
		}
	}

	// collect the polymer chain names of the PDB structure, sorted and without duplicates
	Set<String> polyChainNames = new TreeSet<String>();
	for (Chain polyChain : sPdb.getPolyChains()) {
		String name = polyChain.getName();
		polyChainNames.add(name);
	}

	for (String name : polyChainNames) {
		Chain pdbChain = sPdb.getPolyChainByPDB(name);
		Chain cifChain = sCif.getPolyChainByPDB(name);
		testSingleChain(pdbChain, cifChain);
	}
}
/**
 * Compares one chain parsed from PDB with the chain of the same name parsed from mmCIF:
 * names and ids, entity info and mol_id (mismatches are only logged and recorded, not
 * asserted), parent structure, and ATOM / SEQRES lengths and per-type group counts.
 * <p>
 * The method returns early for chains whose atom sequence is made only of X, and for
 * chain F of 4IMJ, because their SEQRES data can't be compared reliably.
 *
 * @param cPdb the chain parsed from the PDB file
 * @param cCif the chain parsed from the mmCIF file
 */
private void testSingleChain(Chain cPdb, Chain cCif) {
	assertNotNull(cPdb);
	assertNotNull(cCif);

	String chainId = cPdb.getName();

	assertEquals("failed for getName():",cPdb.getName(),cCif.getName());
	// TODO no internalChainID if parsed from PDB, should an ID be assigned following the same rules as in mmCIF?
	//assertEquals("failed for getInternalChainID():",cPdb.getInternalChainID(),cCif.getInternalChainID());
	assertNotNull("getId is null",cCif.getId());
	assertTrue("id used in mmCIF files must be at most 4 characters",cCif.getId().length()<=4);
	assertEquals("chainID must be 1 character only, failed for pdb", 1, cPdb.getName().length());
	assertEquals("chainID must be 1 character only, failed for cif", 1, cCif.getName().length());

	// getCompound() is some times null for badly formatted PDB files (e.g. 4a10, all waters are in a separate chain F)
	if (isPolymer(cPdb)) {
		assertNotNull("getCompound is null in pdb (chain "+chainId+")",cPdb.getEntityInfo());
		assertNotNull("getCompound is null in cif (chain "+chainId+")",cCif.getEntityInfo());

		// for some badly formatted entries there are mismatches of mol_ids on pdb cs mmcif, e.g. 2efw
		// we thus count them and only warn at the end
		int molIdPdb = cPdb.getEntityInfo().getMolId();
		int molIdCif = cCif.getEntityInfo().getMolId();
		if (molIdPdb!=molIdCif) {
			logger.warn("Mismatching mol_id (entity_id) for {}. pdb: {}, mmCIF: {}",pdbId,molIdPdb,molIdCif);
			pdbIdsWithMismatchingMolIds.add(pdbId);
		}
	}

	assertNotNull("getParent is null in pdb (chain "+chainId+")",cPdb.getStructure());
	assertNotNull("getParent is null in cif (chain "+chainId+")",cCif.getStructure());

	assertEquals("failed for getAtomLength (chain "+chainId+"):",cPdb.getAtomLength(),cCif.getAtomLength());

	// entries with polymers composed of all unknowns (giving only-X sequences) can't be aligned seqres-to-atom (for PDB files)
	// we've got to skip them because they won't have seqres groups
	// e.g. is 1jnv chain A
	if (cPdb.getAtomSequence().matches("^X+$")) return;

	// note for getSeqResLength to work one needs the setAlignSeqRes option in the parsers
	assertEquals("failed for getSeqResLength pdb vs cif (chain "+chainId+"):",
			cPdb.getSeqResLength(),cCif.getSeqResLength());

	assertEquals("failed for getSeqResGroups().size pdb vs cif",
			cPdb.getSeqResGroups().size(), cCif.getSeqResGroups().size());
	assertEquals("getSeqResLength and getSeqResGroups.size should coincide in pdb:",
			cPdb.getSeqResLength(),cPdb.getSeqResGroups().size());
	assertEquals("getSeqResLength and getSeqResGroups.size should coincide in cif:",
			cCif.getSeqResLength(),cCif.getSeqResGroups().size());

	// (getAtomLength pdb vs cif was already asserted above, before the only-X early return)
	assertEquals("failed for getAtomGroups().size pdb vs cif",
			cPdb.getAtomGroups().size(), cCif.getAtomGroups().size());
	assertEquals("getAtomLength and getAtomGroups.size should coincide in pdb:",
			cPdb.getAtomLength(),cPdb.getAtomGroups().size());
	assertEquals("getAtomLength and getAtomGroups.size should coincide in cif:",
			cCif.getAtomLength(),cCif.getAtomGroups().size());

	assertEquals("failed for getAtomGroups(GroupType.AMINOACID) pdb vs cif:",
			cPdb.getAtomGroups(GroupType.AMINOACID).size(),cCif.getAtomGroups(GroupType.AMINOACID).size());
	assertEquals("failed for getAtomGroups(GroupType.HETATM) pdb vs cif:",
			cPdb.getAtomGroups(GroupType.HETATM).size(),cCif.getAtomGroups(GroupType.HETATM).size());
	assertEquals("failed for getAtomGroups(GroupType.NUCLEOTIDE) pdb vs cif:",
			cPdb.getAtomGroups(GroupType.NUCLEOTIDE).size(),cCif.getAtomGroups(GroupType.NUCLEOTIDE).size());

	// In 4imj, chain F there's an alignment ambiguity because of a repeat, so the seqres to atom alignment
	// doesn't work properly for it, we skip the rest of the test for this chain.
	// The PDB code is compared case-insensitively so the exclusion applies whatever the case of the parsed code.
	if (cPdb.getStructure().getPDBCode().equalsIgnoreCase("4IMJ") && cPdb.getName().equals("F")) return;

	// messages name getSeqResGroups, which is what these three assertions actually check
	assertEquals("failed for getSeqResGroups(GroupType.AMINOACID) pdb vs cif:",
			cPdb.getSeqResGroups(GroupType.AMINOACID).size(),cCif.getSeqResGroups(GroupType.AMINOACID).size());
	assertEquals("failed for getSeqResGroups(GroupType.HETATM) pdb vs cif:",
			cPdb.getSeqResGroups(GroupType.HETATM).size(),cCif.getSeqResGroups(GroupType.HETATM).size());
	assertEquals("failed for getSeqResGroups(GroupType.NUCLEOTIDE) pdb vs cif:",
			cPdb.getSeqResGroups(GroupType.NUCLEOTIDE).size(),cCif.getSeqResGroups(GroupType.NUCLEOTIDE).size());

	assertTrue("getAtomLength must be at least 1 in length (chain "+chainId+")",cPdb.getAtomLength()>=1);

	if (isPolymer(cPdb)) {
		// some badly formatted PDB files (e.g. 4a10, all waters are in a separate chain F) have 0 seqres length for some chains
		assertTrue("getSeqResLength must be at least 1 in length (chain "+chainId+")",cPdb.getSeqResLength()>=1);
	}

	// in the current implementation this is not a valid test, entries that have aminoacid residues in
	// ligands, e.g. 3o6g won't pass this test
	//assertTrue("getSeqResLength ("+cPdb.getSeqResLength()+") must be >= than getAtomGroups(GroupType.AMINOACID).size() ("+
	//		cPdb.getAtomGroups(GroupType.AMINOACID).size()+") (chain "+chainName+")",
	//		cPdb.getSeqResLength()>=cPdb.getAtomGroups(GroupType.AMINOACID).size());

	int allAtomGroupsSizePdb =
			cPdb.getAtomGroups(GroupType.AMINOACID).size()+
			cPdb.getAtomGroups(GroupType.HETATM).size()+
			cPdb.getAtomGroups(GroupType.NUCLEOTIDE).size();
	int allAtomGroupsSizeCif =
			cCif.getAtomGroups(GroupType.AMINOACID).size()+
			cCif.getAtomGroups(GroupType.HETATM).size()+
			cCif.getAtomGroups(GroupType.NUCLEOTIDE).size();

	assertEquals("failed for sum of all atom group sizes (hetatm+nucleotide+aminoacid) pdb vs mmcif",allAtomGroupsSizePdb,allAtomGroupsSizeCif);
	assertEquals("failed for getAtomLength==hetatm+aminos+nucleotide",cPdb.getAtomLength(), allAtomGroupsSizePdb);

	int allSeqResGroupsSizePdb =
			cPdb.getSeqResGroups(GroupType.AMINOACID).size()+
			cPdb.getSeqResGroups(GroupType.HETATM).size()+
			cPdb.getSeqResGroups(GroupType.NUCLEOTIDE).size();
	int allSeqResGroupsSizeCif =
			cCif.getSeqResGroups(GroupType.AMINOACID).size()+
			cCif.getSeqResGroups(GroupType.HETATM).size()+
			cCif.getSeqResGroups(GroupType.NUCLEOTIDE).size();

	assertEquals("failed for sum of all seqres group sizes (hetatm+nucleotide+aminoacid) pdb vs mmcif",allSeqResGroupsSizePdb,allSeqResGroupsSizeCif);
	assertEquals("failed for getSeqResLength==hetatm+aminos+nucleotide",cPdb.getSeqResLength(), allSeqResGroupsSizePdb);
}
/**
 * Fetches the given entry through the shared cache with mmCIF mode switched off,
 * with SEQRES alignment and biological assembly parsing enabled.
 *
 * @param pdbId the PDB id of the entry
 * @return the structure returned by the cache
 * @throws IOException if thrown by the cache
 * @throws StructureException if thrown by the cache
 */
private Structure getPdbStructure(String pdbId) throws IOException, StructureException {
	cache.setUseMmCif(false);

	// set parsing params here:
	params.setAlignSeqRes(true);
	//params.setLoadChemCompInfo(true);
	params.setParseBioAssembly(true);

	Structure fromPdb = cache.getStructure(pdbId);
	return fromPdb;
}
/**
 * Fetches the given entry through the shared cache with mmCIF mode switched on,
 * with SEQRES alignment and biological assembly parsing enabled.
 *
 * @param pdbId the PDB id of the entry
 * @return the structure returned by the cache
 * @throws IOException if thrown by the cache
 * @throws StructureException if thrown by the cache
 */
private Structure getCifStructure(String pdbId) throws IOException, StructureException {
	cache.setUseMmCif(true);

	// set parsing params here:
	params.setAlignSeqRes(true);
	//params.setLoadChemCompInfo(true);
	params.setParseBioAssembly(true);

	Structure fromCif = cache.getStructure(pdbId);
	return fromCif;
}
/**
 * Reads a classpath resource containing a list of PDB codes, one per line.
 * Lines starting with "#" are treated as comments and skipped.
 * Reading stops at the first empty line, which is useful to quickly test a modified list.
 *
 * @param testSetFile the resource path of the list file, resolved relative to this class
 * @return the PDB codes in file order
 * @throws IOException if the resource can't be found or can't be read
 * @throws IllegalArgumentException if a line does not match a digit followed by 3 word characters
 */
private List<String> readTestSetFile(String testSetFile) throws IOException {

	InputStream inStream = this.getClass().getResourceAsStream(testSetFile);
	if (inStream == null) {
		// fail with a clear message instead of an NPE from the reader constructor
		throw new FileNotFoundException("Test set file "+testSetFile+" not found in the classpath");
	}

	BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
	try {
		List<String> list = new ArrayList<String>();
		String line;
		while ((line=br.readLine())!=null) {
			if (line.startsWith("#")) continue;
			if (line.isEmpty()) break;

			if (!line.matches("\\d\\w\\w\\w"))
				throw new IllegalArgumentException("The input test set "+testSetFile+" contains an invalid PDB code: "+line);

			list.add(line);
		}
		return list;
	} finally {
		// close the reader also when reading or validation fails
		br.close();
	}
}
/**
 * Tells whether the chain has at least one SEQRES group that is an {@code AminoAcid}
 * or a {@code NucleotideImpl}.
 *
 * @param chain the chain to inspect
 * @return true if such a group exists, false otherwise
 */
private boolean isPolymer(Chain chain) {
	for (Group seqResGroup : chain.getSeqResGroups()) {
		boolean isAminoAcid = seqResGroup instanceof AminoAcid;
		boolean isNucleotide = seqResGroup instanceof NucleotideImpl;
		if (isAminoAcid || isNucleotide) {
			return true;
		}
	}

	// not a single amino-acid or nucleotide, must be something not polymeric
	return false;
}
/**
 * Tells whether any entity of the structure has a description containing "SUGAR".
 *
 * @param s the structure to inspect
 * @return true if at least one entity description contains "SUGAR"
 */
private boolean containsSugar(Structure s) {
	for (EntityInfo entity : s.getEntityInfos()) {
		String description = entity.getDescription();
		if (description.contains("SUGAR")) {
			return true;
		}
	}
	return false;
}
/**
 * Tells whether any atom group in the non-polymer chains of the structure is named "UNL".
 *
 * @param s the structure to inspect
 * @return true if at least one such group exists
 */
private boolean containsUNL(Structure s) {
	for (Chain nonPolyChain : s.getNonPolyChains()) {
		for (Group atomGroup : nonPolyChain.getAtomGroups()) {
			String groupName = atomGroup.getPDBName();
			if (groupName.equals("UNL")) {
				return true;
			}
		}
	}
	return false;
}
}